Ensembl ID ENSG00000187951 had an incorrect gene name assigned to it, so in the newest preprocessing pipeline it was corrected and removed during Filtering (since it was a lncRNA), but removing this gene from the expression dataset seems to have caused bigger changes in the resulting dataset that expected.
# Load raw data
datExpr = read.csv('./../Data/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../Data/RNAseq_ASD_datMeta.csv')
datMeta = datMeta %>% mutate(Brain_Region = as.factor(Region)) %>%
mutate(Brain_lobe = ifelse(Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45'), 'Frontal',
ifelse(Brain_Region %in% c('BA3_1_2_5', 'BA7'), 'Parietal',
ifelse(Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22'), 'Temporal',
'Occipital')))) %>%
mutate(Batch = as.factor(gsub('/', '.', RNAExtractionBatch)),
Diagnosis = factor(Diagnosis_, levels=c('CTL','ASD'))) %>%
dplyr::select(-Diagnosis_)
# Filter to only keep Frontal Lobe samples
datMeta = datMeta %>% filter(Brain_lobe=='Frontal')
datExpr = datExpr %>% dplyr::select(which(colnames(datExpr) %in% paste0('X',gsub('-','_',datMeta$X))))
The gene has quite a high standard deviation given its mean expression value
plot_data = data.frame('ID' = rownames(datExpr), 'Mean' = rowMeans(datExpr)+1, 'SD' = apply(datExpr, 1, sd)+1,
'is_ENSG00000187951' = rownames(datExpr) == 'ENSG00000187951',
alpha = ifelse(rownames(datExpr) == 'ENSG00000187951', 1, 0.1),
color = ifelse(rownames(datExpr) == 'ENSG00000187951', '#006699', '#808080'))
plot_data %>% ggplot(aes(Mean, SD)) + geom_point(alpha = plot_data$alpha, color = plot_data$color) +
geom_abline(color='gray') + scale_x_log10() + scale_y_log10() + theme_minimal()
This high variance is caused by a single Sample, which has a very high and uncharacteristic level of expression when comparing it to the level of expression this gene has in the other samples
plot_data = data.frame('ID' = colnames(datExpr), 'meanExpr' = colMeans(datExpr), 'geneExpr' = t(datExpr[rownames(datExpr) == 'ENSG00000187951',]), 'Diagnosis' = datMeta$Diagnosis)
summary(plot_data$ENSG00000187951)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.00 17.25 24.50 87.41 28.75 1460.00
ggplotly(plot_data %>% ggplot(aes(meanExpr, ENSG00000187951, color=Diagnosis)) + geom_point(alpha=0.8) +
theme_minimal() + xlab('Mean expression by Sample') + ylab('Expression of ENSG00000187951 by Sample'))
This may be causing changes in the resulting level of expression of the genes when performing the vst transformation to stabilise variance in the dataset
The difference is quite small, but it seems to have linearly lowered the mean level of expression of the genes, affecting the genes with the lowest levels of expression the most
# Original preprocessing
load('./../Data/preprocessed_data_old.RData')
datExpr_before = datExpr %>% data.frame %>% dplyr::select(which(colnames(datExpr) %in% paste0('X',gsub('-','_',datMeta$X))))
# New preprocessing
load('./../Data/preprocessed_data.RData')
datExpr_now = datExpr %>% data.frame
plot_data = data.frame('MeanExpr_before' = rowMeans(datExpr_before)[rownames(datExpr_before)!='ENSG00000187951'],
'MeanExpr_after' = rowMeans(datExpr_now)) %>%
mutate(Diff = MeanExpr_before - MeanExpr_after)
summary(plot_data$Diff)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.051e-02 -7.520e-03 -2.843e-03 -5.210e-03 -1.164e-03 -1.114e-05
plot_data %>% ggplot(aes(MeanExpr_before, MeanExpr_after, color=Diff)) + geom_point(alpha=0.1) +
geom_abline(color='gray') + xlab('Mean Level of Expression Before') +
ylab('Mean Level of Expression Now') + coord_fixed() + scale_color_viridis() +
theme_minimal()
rm(datExpr, plot_data)
getModuleDiagCor = function(dataset, datExpr){
datTraits = datMeta %>% dplyr::select(Diagnosis, Sex, Age, PMI, RNAExtractionBatch) %>%
dplyr::rename('ExtractionBatch' = RNAExtractionBatch)
ME_object = datExpr %>% t %>% moduleEigengenes(colors = dataset$Module)
MEs = orderMEs(ME_object$eigengenes)
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits)$correlations[1,-1]) %>% t
moduleDiagCor = moduleTraitCor[,1]
if(sum(!complete.cases(moduleDiagCor))>0){
for(m in names(moduleDiagCor)){
if(is.na(moduleDiagCor[m]))
moduleDiagCor[m] = polyserial(MEs[,m], datMeta$Diagnosis)
}
}
return(moduleDiagCor)
}
clustering_selected = 'DynamicHybridMergedSmall'
# Before
dataset_before = read.csv(paste0('./../Data/dataset_', clustering_selected, '_old.csv'))
dataset_before$Module = dataset_before[,clustering_selected]
moduleDiagCor_before = getModuleDiagCor(dataset_before, datExpr_before)
# Now
dataset_now = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'))
dataset_now$Module = dataset_now[,clustering_selected]
moduleDiagCor_now = getModuleDiagCor(dataset_now, datExpr_now)
rm(getModuleDiagCor, clustering_selected)
module = dataset_before$Module[dataset_before$ID=='ENSG00000187951']
cat(paste0('Gene ENSG00000187951 belonged to Module ', module, ' which had a Module-Diagnosis correlation of ',
round(moduleDiagCor_before[module][[1]],2)))
## Gene ENSG00000187951 belonged to Module #00BF7A which had a Module-Diagnosis correlation of -0.51
rm(module)
top_modules_before = gsub('ME','',names(moduleDiagCor_before)[abs(moduleDiagCor_before)>0.9])
top_modules_now = gsub('ME','',names(moduleDiagCor_now)[abs(moduleDiagCor_now)>0.9])
genes_top_modules_memberships = data.frame('ID' = rownames(datExpr_now))
for(tm in top_modules_before){
genes_top_modules_memberships[,tm] = dataset_before$Module[dataset_before$ID!='ENSG00000187951']==tm
}
for(tm in top_modules_now){
genes_top_modules_memberships[,tm] = dataset_now$Module==tm
}
cat('Top Modules sizes before:')
## Top Modules sizes before:
colSums(genes_top_modules_memberships[,1+(1:length(top_modules_before))])
## #FE6E8A #7DAE00 #E08B00
## 32 149 223
cat('Top Modules sizes now:')
## Top Modules sizes now:
colSums(genes_top_modules_memberships[,(2+length(top_modules_before)):ncol(genes_top_modules_memberships)])
## #D873FC #D69100
## 23 891
Modules with positive correlation to Diagnosis
pos_corr = genes_top_modules_memberships %>%
dplyr::select(c(gsub('ME','',names(moduleDiagCor_before)[moduleDiagCor_before>0.9]),
gsub('ME','',names(moduleDiagCor_now)[moduleDiagCor_now>0.9])))
grid.newpage()
grid.draw(draw.triple.venn(sum(pos_corr[,1]), sum(pos_corr[,2]), sum(pos_corr[,3]),
sum(pos_corr[,1]*pos_corr[,2]), sum(pos_corr[,2]*pos_corr[,3]), sum(pos_corr[,1]*pos_corr[,3]),
sum(pos_corr[,1]*pos_corr[,2]*pos_corr[,3]),
category = paste0(colnames(pos_corr), c(' (Before)', ' (Before)', ' (Now)')),
fill = colnames(pos_corr), fontfamily = rep('sans-serif',7),
alpha = rep(0.25,3), lty = rep('blank', 3), cat.fontfamily = rep('sans-serif',3)))
cat(paste0('We recover ', sum(pos_corr[,1] + pos_corr[,2] & pos_corr[,3]>0), '/',
sum(pos_corr[,1]+pos_corr[,2]), ' of the genes found in the original Module (',
round(100*sum(pos_corr[,1] + pos_corr[,2] & pos_corr[,3]>0)/(sum(pos_corr[,1]+pos_corr[,2]))),'%)'))
## We recover 206/372 of the genes found in the original Module (55%)
Modules with negative correlation to Diagnosis
neg_corr = genes_top_modules_memberships %>%
dplyr::select(c(gsub('ME','',names(moduleDiagCor_before)[moduleDiagCor_before < -0.9]),
gsub('ME','',names(moduleDiagCor_now)[moduleDiagCor_now < -0.9])))
grid.newpage()
grid.draw(draw.pairwise.venn(sum(neg_corr[,1]), sum(neg_corr[,2]),sum(neg_corr[,1]*neg_corr[,2]),
category = paste0(colnames(neg_corr), c(' (Before)', ' (Now)')),
fill = colnames(neg_corr), fontfamily = rep('sans-serif',3),
alpha = rep(0.25,2), lty = rep('blank', 2), cat.fontfamily = rep('sans-serif',2)))
cat(paste0('We recover ', sum(neg_corr[,1] & neg_corr[,2]>0), '/',
sum(neg_corr[,1]), ' of the genes found in the original Module (',
round(100*sum(neg_corr[,1] & neg_corr[,2]>0)/sum(neg_corr[,1])),'%)'))
## We recover 21/32 of the genes found in the original Module (66%)
# Three Modules
# grid.newpage()
# grid.draw(draw.triple.venn(sum(neg_corr[,1]), sum(neg_corr[,2]), sum(neg_corr[,3]),
# sum(neg_corr[,1]*neg_corr[,2]), sum(neg_corr[,2]*neg_corr[,3]), sum(neg_corr[,1]*neg_corr[,3]),
# sum(neg_corr[,1]*neg_corr[,2]*neg_corr[,3]),
# category = paste0(colnames(neg_corr), c(' (Before)', ' (Now)', ' (Now)')),
# fill = colnames(neg_corr), fontfamily = rep('sans-serif',7),
# alpha = rep(0.25,3), lty = rep('blank', 3), cat.fontfamily = rep('sans-serif',3)))
#
# cat(paste0('We recover ', sum(neg_corr[,1] & neg_corr[,2]+neg_corr[,3]>0), '/',
# sum(neg_corr[,1]), ' of the genes found in the original Module (',
# round(100*sum(neg_corr[,1] & neg_corr[,2]+neg_corr[,3]>0)/sum(neg_corr[,1])),'%)'))